FILENAME: Analysis Notebook.ipynb
PROJECT: Multivariate Financial Forecasting
DATE CREATED:24-APR-20
DATE UPDATED:24-APR-20
TASK: Develop and implement a recurrent neural network
PURPOSE: Given a multivariate dataset, forecast and predict the corresponding response value for each record
INTENT: The purpose of this project is to conduct exploratory analysis of the provided data set and apply both supervised and unsupervised algorithms in order to extract meaningful information in support of future open source analysis. The work is broken down into two separate projects, with each project having four (4) distinct phases:
PROJECT: Randomized Budget Data
Environment Setup
Data ETL
Data Exploration
Model Development
Create random arrays to store the test values:
YEAR +5: yr5_forecast
YEAR +4: yr4_forecast
YEAR +3: yr3_forecast
YEAR +2: yr2_forecast
YEAR +1: yr1_forecast
YEAR +0: plan
YEAR -1: approp
YEAR -2: obligate
# Notebook display helpers. Everything comes from the public IPython.display
# module: importing `display` from IPython.core.display is deprecated, and HTML
# only needs to be imported once.
from IPython.display import HTML, Image, display

# Architecture diagram shipped with the project.
Image(filename = "data/rnn.png", width=750, height=750)

# Inject CSS so the notebook container uses the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
Import the necessary libraries needed for ETL, engineering, and export efforts
import pandas as pd
import csv
import random
import sqlite3
import itertools
import numpy as np
import datetime
import time as t
import getpass as gp
Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
Import the required ML & neural net libraries
from scipy import stats
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.wrappers.scikit_learn import KerasRegressor
def init_array(df_length):
    '''
    DESCRIPTION: A function to create and return a two-dimensional table (pandas DataFrame)
    of randomized float values, one row per record.

    The first column is a random integer draw in [100000, 30000000). Every later
    column is the previous column multiplied by a per-row uniform random factor
    and rounded to 2 decimals, so each value is derived from the one before it.

    PARAMETERS:
        df_length: number of rows (records) to generate

    RETURNS:
        DataFrame with df_length rows and the columns, in order:
        'yr5_forecast', 'yr4_forecast', 'yr3_forecast', 'yr2_forecast',
        'yr1_forecast', 'planned', 'appropriated', 'obligated'

    NOTE(review): other cells in this notebook still read the older column names
    ('yr5_prep', 'yr3_plan', 'yr2_approp', 'yr1_oblig', 'yr0_exe') and assume six
    columns; they need to be reconciled with the columns produced here.
    '''
    # (column name, lowest factor, highest factor) for each derived column, in
    # the order the values are chained.
    stages = [
        ('yr4_forecast', 0.5, 1.5),
        ('yr3_forecast', 0.6, 1.4),
        ('yr2_forecast', 0.7, 1.3),
        ('yr1_forecast', 0.8, 1.2),
        ('planned', 0.6, 1.3),
        ('appropriated', 0.6, 1.2),
        ('obligated', 0.7, 1.0),
    ]

    previous = np.random.randint(low=100000, high=30000000, size=df_length)
    columns = {'yr5_forecast': previous}
    for name, low, high in stages:
        factor = np.random.uniform(low=low, high=high, size=df_length)
        previous = np.round(previous * factor, 2)
        columns[name] = previous

    # Build the frame directly from the generated series so no empty (all-NaN)
    # placeholder columns are left behind.
    return pd.DataFrame(columns)
Start the project timer
# Wall-clock timestamp taken at the start of the run; subtracted from a later
# t.time() reading to report total execution time.
program_start = t.time()
Set the random seed for the project in order to ensure consistent results
# The synthetic data is drawn from NumPy's global generator (np.random.*), so
# seed it as well; seeding only the standard-library `random` module leaves
# those draws different on every run.
random.seed(6)
np.random.seed(6)
Create random arrays to store the test values:
Create the training array
# Generate 10,000 synthetic records for training and preview the last rows.
train_df = init_array(df_length=10000)
train_df.tail(n=10)

# Column names of the generated frame.
col_list = train_df.columns.tolist()
col_list

# Work on a copy so the generated frame itself is left untouched.
dataset = train_df.copy()
dataset.tail(n=10)
import plotly.graph_objects as go
import numpy as np
np.random.seed(1)

# One series per column of the working dataset.
y5 = dataset.yr5_prep
y4 = dataset.yr4_forecast
y3 = dataset.yr3_plan
y2 = dataset.yr2_approp
y1 = dataset.yr1_oblig
y0 = dataset.yr0_exe

# Draw one box per column. Each trace is labelled with the name of the series
# it actually plots; previously the labels were applied in reverse order, so
# every box carried another column's name.
fig = go.Figure()
for series in (y5, y4, y3, y2, y1, y0):
    fig.add_trace(go.Box(x=series, name=series.name))
fig.show()
# Pairwise scatter/histogram grid over the six budget columns.
budget_pair = train_df.loc[
    :,
    ['yr5_prep', 'yr4_forecast', 'yr3_plan', 'yr2_approp', 'yr1_oblig', 'yr0_exe'],
]
sns.set(color_codes=True, style="ticks")
sns.pairplot(data=budget_pair)
Convert dataframe to numpy arrays
# Features: the first five columns, as a 2-D array.
x = dataset.iloc[:, :5].to_numpy()
x
# Response: the sixth column, reshaped into a single-column 2-D array.
y = dataset.iloc[:, 5].to_numpy().reshape(-1, 1)
y
Scale the data from 0 -> 1
# Separate scalers for features and response so each can be applied (or
# inverted) independently later on.
scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()

# Fit and transform in one step, then echo the fitted scaler as before.
xscale = scaler_x.fit_transform(x)
print(scaler_x)
yscale = scaler_y.fit_transform(y)
print(scaler_y)
Segregate master data to 'train', 'test', 'split'
# Split the scaled features and scaled response into train and test subsets.
# No test_size or random_state is passed, so scikit-learn's defaults apply.
X_train, X_test, y_train, y_test = train_test_split(xscale, yscale)
Verify the array shape
# Dimensions of the training feature array.
X_train.shape
y_train is the response variable
# Dimensions of the training response array.
y_train.shape
# Fully connected regression network: 5 inputs -> 10 -> 5 -> 1 linear output.
model = Sequential([
    Dense(10, input_dim=5, kernel_initializer='normal', activation='relu'),
    Dense(5, activation='relu'),
    Dense(1, activation='linear'),
])
model.summary()

# Optimise mean squared error; also track MSE and MAE as metrics.
model.compile(optimizer='adam', loss='mse', metrics=['mse','mae'])

# Hold out 20% of the training data for validation during fitting.
history = model.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=150,
    validation_split=0.2,
    verbose=1,
)
# Plot the per-epoch training and validation loss recorded by model.fit.
fig = go.Figure()
fig.add_trace(go.Scatter(y=history.history['loss'],
                         mode='lines',
                         name='Train'))
fig.add_trace(go.Scatter(y=history.history['val_loss'],
                         mode='lines+markers',
                         name='Validation'))
# Axis title fonts are set through `title.font`; the older `titlefont` axis
# property is deprecated and rejected by current plotly releases.
fig.update_layout(
    autosize=False,
    width=1500,
    height=750,
    title = "Train vs. Validation Loss Test",
    xaxis=dict(
        title=dict(text="No. of epochs", font=dict(size=20)),
    ),
    yaxis=dict(
        title=dict(text="Loss Value", font=dict(size=20)),
    )
)
fig.show()
Create a new array with dummy data and test the model's effectiveness against it
# Fresh synthetic data the model has not seen.
predict_full = init_array(df_length=25000)
# Every column except the last one is used as model input.
valid_df = predict_full.iloc[:, :predict_full.shape[1] - 1]
valid_df.tail(n=100)
Convert the dataframe to a two dimensional numpy array
# NumPy view of the validation inputs, in the layout passed to model.predict.
valid_array = valid_df.to_numpy()
valid_array
Validate the shape of the newly created array
# Dimensions of the validation input array.
valid_array.shape
# The network was fitted on MinMax-scaled inputs and a MinMax-scaled response,
# so new inputs must go through the same feature scaler before prediction and
# the outputs must be mapped back to original units with the response scaler.
# Feeding raw values (and reading raw outputs) makes the predictions
# incomparable with the actual response values.
predict_val = scaler_y.inverse_transform(
    model.predict(scaler_x.transform(valid_array))
)
predict_val
Merge the numpy predictor array as a standalone column to the predict_full dataframe
# Store the model output next to the inputs it was computed from.
predict_full['predict_values'] = predict_val
predict_full.tail(10)
Calculate the difference in actual ('yr0_exe') and predicted ('predict_values') model values and assign a difference value for each record in the 'delta' column
# Relative error per record: (actual - predicted) / actual.
predict_full['delta'] = (
    predict_full['yr0_exe']
    .sub(predict_full['predict_values'])
    .div(predict_full['yr0_exe'])
)
predict_full.tail(n=10)
Display the histogram of the delta values (i.e distribution)
# Histogram of the relative prediction error, with a rug plot in the margin
# and every column of predict_full available in the hover tooltip.
fig = px.histogram(predict_full, x="delta",marginal="rug", # can be `box`, `violin`
hover_data=predict_full.columns, color_discrete_sequence=['indianred'], opacity = 0.5)
# Let plotly size the figure automatically and set the chart title.
fig.update_layout(
autosize=True,
title = "Actual vs Prediction value Historgram ")
fig.show()
Display distribution of box & whisker plot for response and predict values
# yr5 is the yr0_exe column of `dataset` (the copy of the training frame);
# yr6 is the prediction column of `predict_full`. The variable names do not
# refer to forecast years.
# NOTE(review): the two series come from different frames — confirm that the
# training response, not predict_full's own yr0_exe, is the intended comparison.
yr5 = dataset.yr0_exe
yr6 = predict_full.predict_values
# One box per series so their distributions can be compared side by side.
fig = go.Figure()
fig.add_trace(go.Box(x=yr5, name = "yr0_exe"))
fig.add_trace(go.Box(x=yr6, name = "predict_values"))
fig.show()
Retrieve the statistical parameters for the linear model
# Least-squares fit of predicted values against the actual response values.
x = predict_full['yr0_exe']
y = predict_full['predict_values']
slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
# linregress returns the correlation coefficient r, so it is squared here to
# report the value the "R-squared" label promises.
print(" Slope: {}\n Intercept: {}\n R-squared: {}\n P-Value: {}\n Standard Error: {}". format(slope, intercept, r_value ** 2, p_value, std_err))
Plot the response values (original) against the predicted values
# Scatter of actual vs. predicted values with an ordinary-least-squares
# trendline overlaid.
fig = px.scatter(predict_full, x="yr0_exe", y="predict_values", trendline="ols", opacity=0.25, color_discrete_sequence=['green'])
# Axis title fonts are set through `title.font`; the older `titlefont` axis
# property is deprecated and rejected by current plotly releases.
fig.update_layout(
    autosize=False,
    width=1000,
    height=750,
    title = "Response values vs predicted values scatterplot",
    xaxis=dict(
        title=dict(text="yr0_exe values (Response Values)", font=dict(size=20)),
    ),
    yaxis=dict(
        title=dict(text="predict_values (Predicted Values)", font=dict(size=20)),
    )
)
fig.show()
# One random coordinate pair per prediction record.
df_size = predict_full.shape[0]
df_size

# Latitude is drawn first, then longitude, each uniformly within fixed bounds.
lat_random = np.random.uniform(19.50139, 64.85694, df_size)
long_random = np.random.uniform(-161.75583, -68.01197, df_size)
len(long_random)

# Copy of the prediction frame with the two coordinate columns appended.
geo_df = predict_full.assign(latitude=lat_random, longitude=long_random)
geo_df.tail(n=10)
# Plot the first 50 records by longitude/latitude, with marker size taken
# from the predicted value.
fig = px.scatter(geo_df[:50], x="longitude", y="latitude", opacity=0.25, size = "predict_values", color_discrete_sequence=['green'])
# Axis title fonts are set through `title.font`; the older `titlefont` axis
# property is deprecated and rejected by current plotly releases.
fig.update_layout(
    autosize=False,
    width=1000,
    height=750,
    title = "US Map of Budget points",
    xaxis=dict(
        title=dict(text="Longitude", font=dict(size=20)),
    ),
    yaxis=dict(
        title=dict(text="Latitude", font=dict(size=20)),
    )
)
fig.show()
from urllib.request import urlopen
import json

# Download the county-boundary GeoJSON from the plotly datasets repository and
# parse it into a Python object.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.loads(response.read())
# Despite its name, program_end holds the elapsed duration in seconds
# (current time minus the start timestamp), not an end timestamp.
program_end = t.time() - program_start
# Round to two decimals for display.
elapsed = round(program_end, 2)
print("Total time for program execution is {} seconds".format(elapsed))
# Geospatial plotting dependencies. The alias needs the `as` keyword;
# `import geopandas gpd` is a SyntaxError.
import descartes
import geopandas as gpd